library(MASS) # provides the parcoord() function that automatically builds parallel coordinates chart
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.3.2 ✔ purrr 0.3.2
## ✔ tibble 3.0.2 ✔ dplyr 1.0.0
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Load the GBIF data dump (one row per institutionCode, with a `total` column
# and one has_* count column per term) into a tibble.
gbif_us_dataset <- read_csv(
  file = "../gbif_institutionCode_summmary/gbif_us_institutionCode_MIDS_2020.07.08.csv"
)
## Parsed with column specification:
## cols(
## institutionCode = col_character(),
## total = col_double(),
## has_collectionCode = col_double(),
## has_catalogNumber = col_double(),
## has_speciesKey = col_double(),
## has_scientificName = col_double(),
## has_acceptedNameUsage = col_double(),
## has_locality = col_double(),
## has_higherGeography = col_double(),
## has_countryCode = col_double(),
## has_coordinates = col_double(),
## has_image = col_double(),
## has_dateIdentified = col_double(),
## has_identifiedBy = col_double(),
## has_recordedBy = col_double(),
## has_eventDate = col_double()
## )
# Sort columns based on MIDS level. relocate() moves the twelve listed has_*
# columns to the front in the order given, so they occupy positions 1:12 and
# every other column (institutionCode, total, ...) follows them.
gbif_us_dataset <- gbif_us_dataset %>%
  relocate(
    has_catalogNumber, has_collectionCode, has_countryCode, has_speciesKey,
    has_locality, has_eventDate, has_recordedBy, has_coordinates, has_image,
    has_identifiedBy, has_dateIdentified, has_higherGeography
  )

# Filter dataset for easier viewing: the rows with the 5 largest and the
# 5 smallest `total` values (top_n() keeps ties, so more rows can come back).
top_5_gbif_us_dataset <- gbif_us_dataset %>% top_n(5, total)
bottom_5_gbif_us_dataset <- gbif_us_dataset %>% top_n(-5, total)

# Generate percentage of totals as data: every one of the 12 MIDS columns is
# divided by that row's `total`, so the values are fractions (count / total),
# not multiplied by 100.
# The division is passed to across() as its function. Previously the closing
# parenthesis sat before the division, so across() received no function and the
# code only worked because mutate() spliced in the unnamed `. / total` frame.
percentage_gbif_us_dataset <- gbif_us_dataset[, 1:12] %>%
  mutate(across(everything(), ~ .x / gbif_us_dataset$total))

# Filter percentages: same calculation for the top-5 and bottom-5 subsets,
# each divided by the totals of its own rows.
top_5_percentage_gbif_us_dataset <- top_5_gbif_us_dataset[, 1:12] %>%
  mutate(across(everything(), ~ .x / top_5_gbif_us_dataset$total))
bottom_5_percentage_gbif_us_dataset <- bottom_5_gbif_us_dataset[, 1:12] %>%
  mutate(across(everything(), ~ .x / bottom_5_gbif_us_dataset$total))
## NULL
The above plot shows the “all or nothing” approach to the digitization of terms (i.e., the up-and-down pattern seen all over the plot). For each term, an institution's records have either close to 0% or close to 100% coverage.
# Build the MASS-based parallel coordinates chart for the top-5-by-`total`
# subset, passing both the raw counts and the matching count / total fractions.
# NOTE(review): generate_mass_parcoord_plot() is not defined in this part of
# the document. The rendered output below is NULL, which suggests it draws as a
# side effect and returns nothing useful — confirm against its definition.
mass_parcoord_plot <- generate_mass_parcoord_plot(top_5_gbif_us_dataset, top_5_percentage_gbif_us_dataset)
# Auto-print the returned value so it appears in the rendered document.
mass_parcoord_plot
## NULL
Again, it is apparent that different collections use terms differently or place emphasis on digitizing different pieces of information (“all or nothing” use of terms).
# Build the MASS-based parallel coordinates chart for the bottom-5-by-`total`
# subset, passing both the raw counts and the matching count / total fractions.
# NOTE(review): generate_mass_parcoord_plot() is not defined in this part of
# the document. The rendered output below is NULL, which suggests it draws as a
# side effect and returns nothing useful — confirm against its definition.
mass_parcoord_plot <- generate_mass_parcoord_plot(bottom_5_gbif_us_dataset, bottom_5_percentage_gbif_us_dataset)
# Auto-print the returned value so it appears in the rendered document.
mass_parcoord_plot
## NULL
This plot is hard to decipher because of the data: when two adjacent terms both have a value of 1, the line segment between them is not drawn, and the same happens for two adjacent values of 0. This needs to be fixed if the plot proves useful.
I can’t figure out why the above plot is totally messed up: it shows a data point above 1 and a data point below -1, but neither value exists in the data. This needs further investigation.
# TODO: NEED TO CLEAN THIS CODE UP
# Re-import the GBIF data dump so the GGally section starts from the raw file.
gbif_us_dataset <- read_csv(
  file = "../gbif_institutionCode_summmary/gbif_us_institutionCode_MIDS_2020.07.08.csv"
)
## Parsed with column specification:
## cols(
## institutionCode = col_character(),
## total = col_double(),
## has_collectionCode = col_double(),
## has_catalogNumber = col_double(),
## has_speciesKey = col_double(),
## has_scientificName = col_double(),
## has_acceptedNameUsage = col_double(),
## has_locality = col_double(),
## has_higherGeography = col_double(),
## has_countryCode = col_double(),
## has_coordinates = col_double(),
## has_image = col_double(),
## has_dateIdentified = col_double(),
## has_identifiedBy = col_double(),
## has_recordedBy = col_double(),
## has_eventDate = col_double()
## )
#' Build a GGally parallel coordinates plot of per-term coverage
#'
#' Reorders the twelve MIDS has_* columns, divides each by the row's `total`,
#' re-attaches `institutionCode`, and draws one semi-transparent line per row
#' with `ggparcoord()`.
#'
#' @param gbif_us_dataset Data frame with the twelve has_* columns named in
#'   the `relocate()` call below plus `total` and `institutionCode`.
#' @return The ggplot object produced by `ggparcoord()` with `theme_bw()`.
generate_ggally_parcoord_plot <- function(gbif_us_dataset) {
  # Sort columns based on MIDS level; relocate() moves the listed columns to
  # the front, so they are columns 1:12 from here on.
  gbif_us_dataset <- gbif_us_dataset %>%
    relocate(
      has_catalogNumber, has_collectionCode, has_countryCode, has_speciesKey,
      has_locality, has_eventDate, has_recordedBy, has_coordinates, has_image,
      has_identifiedBy, has_dateIdentified, has_higherGeography
    )

  # Generate percentage of totals as data (fractions: count / total).
  # The division is passed to across() as its function. Previously the closing
  # parenthesis sat before it, so across() had no function and the result only
  # came out right because mutate() spliced in the unnamed `. / total` frame.
  percentage_gbif_us_dataset <- gbif_us_dataset[, 1:12] %>%
    mutate(across(everything(), ~ .x / gbif_us_dataset$total))

  # Add back in the institutionCode. Naming the cbind() argument sets the
  # column name directly, so no rename by hard-coded position is needed.
  percentage_gbif_us_dataset <- cbind(
    percentage_gbif_us_dataset,
    institutionCode = gbif_us_dataset$institutionCode
  )

  # Only the 12 fraction columns are used as axes. "uniminmax" additionally
  # rescales each axis to its own minimum and maximum.
  ggparcoord(
    data = percentage_gbif_us_dataset,
    columns = 1:12,
    scale = "uniminmax",
    alphaLines = 0.05
  ) +
    theme_bw()
}
# Build the GGally parallel coordinates plot from the full dataset.
ggally_parcordd_plot <- generate_ggally_parcoord_plot(gbif_us_dataset)
# Auto-print the plot object so it is rendered in the document.
ggally_parcordd_plot
The same “up and down” pattern is seen again in this plot.